#pragma once
#include "SHCoeff.hpp"
#include <new>
#include <xmmintrin.h>

namespace zzz{

class SHCoeff4f_SSE{
public:
  union{
    float __declspec(align(16)) v[16];
    struct {__m128 v0,v1,v2,v3;};
  };
public:
  //inline functions
  inline const SHCoeff4f_SSE& operator=(const float *data)
  {
    v[0]=data[0];
    v[1]=data[1];
    v[2]=data[2];
    v[3]=data[3];
    v[4]=data[4];
    v[5]=data[5];
    v[6]=data[6];
    v[7]=data[7];
    v[8]=data[8];
    v[9]=data[9];
    v[10]=data[10];
    v[11]=data[11];
    v[12]=data[12];
    v[13]=data[13];
    v[14]=data[14];
    v[15]=data[15];

    return *this;
  }
  inline const SHCoeff4f_SSE& operator=(const SHCoeff4f &data)
  {
    v[0]=data.v[0];
    v[1]=data.v[1];
    v[2]=data.v[2];
    v[3]=data.v[3];
    v[4]=data.v[4];
    v[5]=data.v[5];
    v[6]=data.v[6];
    v[7]=data.v[7];
    v[8]=data.v[8];
    v[9]=data.v[9];
    v[10]=data.v[10];
    v[11]=data.v[11];
    v[12]=data.v[12];
    v[13]=data.v[13];
    v[14]=data.v[14];
    v[15]=data.v[15];

    return *this;
  }
  inline const SHCoeff4f_SSE& operator=(const SHCoeff4f_SSE& coef)
  {
    /*///  Assembly code

    _asm
    {
      mov eax, coef          //move &coef to eax, coef is a pointer as to assembler
      mov edx, this          //though "this" is always stored in ecx for thiscall functions
      //inline makes the call not guaranteed
      //coef to register
      movaps xmm0, [eax]coef.v    //all floats are moved to register before moved to target
      movaps xmm1, [eax]coef.v+16    //to increase pipeline parallelity
      movaps xmm2, [eax]coef.v+32
      movaps xmm3, [eax]coef.v+48

      //register to this
      movaps [edx]this.v, xmm0
      movaps [edx]this.v+16, xmm1
      movaps [edx]this.v+32, xmm2
      movaps [edx]this.v+48, xmm3
    }

/*///  Equivalent SSE intrinsic code

*((__m128 *)(v))=_mload_ps_(coef.v);
*((__m128 *)(v+4))=_mload_ps_(coef.v+4);
*((__m128 *)(v+8))=_mload_ps_(coef.v+8);
*((__m128 *)(v+12))=_mload_ps_(coef.v+12);
//*/

    return *this;
  }
  inline void operator+=(const SHCoeff4f_SSE& coef)
  {
    /*///  Assembly code
    _asm
    {
      mov eax, coef          //move &coef to eax, coef is a pointer as to assembler
      mov edx, this          //though "this" is always stored in ecx for thiscall functions
      //inline makes the call not guaranteed

      //coef to register
      movaps xmm0, [eax]coef.v
      movaps xmm1, [eax]coef.v+16
      movaps xmm2, [eax]coef.v+32
      movaps xmm3, [eax]coef.v+48

      //this to register
      movaps xmm4, [edx]this.v
      movaps xmm5, [edx]this.v+16
      movaps xmm6, [edx]this.v+32
      movaps xmm7, [edx]this.v+48

      //add
      addps xmm4, xmm0
      addps xmm5, xmm1
      addps xmm6, xmm2
      addps xmm7, xmm3

      //register to this
      movaps [edx]this.v, xmm4
      movaps [edx]this.v+16, xmm5
      movaps [edx]this.v+32, xmm6
      movaps [edx]this.v+48, xmm7
    }

/*///  Equivalent SSE intrinsic code

*((__m128 *)(v))=_madd_ps_(*((__m128 *)(v)),*((__m128 *)(coef.v)));
*((__m128 *)(v+4))=_madd_ps_(*((__m128 *)(v+4)),*((__m128 *)(coef.v+4)));
*((__m128 *)(v+8))=_madd_ps_(*((__m128 *)(v+8)),*((__m128 *)(coef.v+8)));
*((__m128 *)(v+12))=_madd_ps_(*((__m128 *)(v+12)),*((__m128 *)(coef.v+12)));
//*/
  }
  inline void operator-=(const SHCoeff4f_SSE& coef)
  {
    /*///  Assembly code
    _asm
    {
      mov eax, coef          //move &coef to eax, coef is a pointer as to assembler
      mov edx, this          //though "this" is always stored in ecx for thiscall functions
      //inline makes the call not guaranteed

      //coef to register
      movaps xmm0, [eax]coef.v
      movaps xmm1, [eax]coef.v+16
      movaps xmm2, [eax]coef.v+32
      movaps xmm3, [eax]coef.v+48

      //this to register
      movaps xmm4, [edx]this.v
      movaps xmm5, [edx]this.v+16
      movaps xmm6, [edx]this.v+32
      movaps xmm7, [edx]this.v+48

      //subtract
      subps xmm4, xmm0
      subps xmm5, xmm1
      subps xmm6, xmm2
      subps xmm7, xmm3

      //register to this
      movaps [edx]this.v, xmm4
      movaps [edx]this.v+16, xmm5
      movaps [edx]this.v+32, xmm6
      movaps [edx]this.v+48, xmm7
    }

/*///  Equivalent SSE intrinsic code

*((__m128 *)(v))=_msub_ps_(*((__m128 *)(v)),*((__m128 *)(coef.v)));
*((__m128 *)(v+4))=_msub_ps_(*((__m128 *)(v+4)),*((__m128 *)(coef.v+4)));
*((__m128 *)(v+8))=_msub_ps_(*((__m128 *)(v+8)),*((__m128 *)(coef.v+8)));
*((__m128 *)(v+12))=_msub_ps_(*((__m128 *)(v+12)),*((__m128 *)(coef.v+12)));
//*/
  }
  inline const SHCoeff4f_SSE operator*(const float scale) const
  {
    /*///  Assembly code
    __m128 r[4];            //use SHCoeff4f_SSE here will cause an extra
    //constructor call
    _asm
    {
      mov edx, this          //though "this" is always stored in ecx for thiscall functions
      //inline makes the call not guaranteed

      //scale to register
      movss xmm0, scale        //move scale to xmm0
      shufps  xmm0, xmm0, 0      //shuffle scale to all 4 floats in xmm0

      //this to register
      movaps xmm1, [edx]this.v
      movaps xmm2, [edx]this.v+16
      movaps xmm3, [edx]this.v+32
      movaps xmm4, [edx]this.v+48

      //multiply
      mulps xmm1, xmm0
      mulps xmm2, xmm0
      mulps xmm3, xmm0
      mulps xmm4, xmm0

      //register to this
      movaps r, xmm1
      movaps r+16, xmm2
      movaps r+32, xmm3
      movaps r+48, xmm4
    }
    return *((SHCoeff4f_SSE *)r);

/*///  Equivalent SSE intrinsic code

__m128 temp=_mload_ps1_(&scale);
*((__m128 *)(v))=_mmul_ps_(*((__m128 *)(v)),temp);
*((__m128 *)(v+4))=_mmul_ps_(*((__m128 *)(v+4)),temp);
*((__m128 *)(v+8))=_mmul_ps_(*((__m128 *)(v+8)),temp);
*((__m128 *)(v+12))=_mmul_ps_(*((__m128 *)(v+12)),temp);
//*/
  }
  inline void operator*=(const float scale)
  {
    /*///  Assembly code
    _asm
    {
      mov edx, this          //though "this" is always stored in ecx for thiscall functions
      //inline makes the call not guaranteed

      //scale to register
      movss xmm0, scale        //move scale to xmm0
      shufps  xmm0, xmm0, 0      //shuffle scale to all 4 floats in xmm0

      //this to register
      movaps xmm1, [edx]this.v
      movaps xmm2, [edx]this.v+16
      movaps xmm3, [edx]this.v+32
      movaps xmm4, [edx]this.v+48

      //multiply
      mulps xmm1, xmm0
      mulps xmm2, xmm0
      mulps xmm3, xmm0
      mulps xmm4, xmm0

      //register to this
      movaps [edx]this.v, xmm1
      movaps [edx]this.v+16, xmm2
      movaps [edx]this.v+32, xmm3
      movaps [edx]this.v+48, xmm4
    }

/*///  Equivalent SSE intrinsic code

__m128 temp=_mload_ps1_(&scale);
*((__m128 *)(v))=_mmul_ps_(*((__m128 *)(v)),temp);
*((__m128 *)(v+4))=_mmul_ps_(*((__m128 *)(v+4)),temp);
*((__m128 *)(v+8))=_mmul_ps_(*((__m128 *)(v+8)),temp);
*((__m128 *)(v+12))=_mmul_ps_(*((__m128 *)(v+12)),temp);
//*/
  }
  inline void operator/=(const float scale)
  {
    /*///  Assembly code
    _asm
    {
      mov edx, this          //though "this" is always stored in ecx for thiscall functions
      //inline makes the call not guaranteed

      //scale to register
      movss xmm0, scale        //move scale to xmm0
      shufps  xmm0, xmm0, 0      //shuffle scale to all 4 floats in xmm0

      //this to register
      movaps xmm1, [edx]this.v
      movaps xmm2, [edx]this.v+16
      movaps xmm3, [edx]this.v+32
      movaps xmm4, [edx]this.v+48

      //divide
      divps xmm1, xmm0
      divps xmm2, xmm0
      divps xmm3, xmm0
      divps xmm4, xmm0

      //register to this
      movaps [edx]this.v, xmm1
      movaps [edx]this.v+16, xmm2
      movaps [edx]this.v+32, xmm3
      movaps [edx]this.v+48, xmm4
    }

/*///  Equivalent SSE intrinsic code

__m128 temp=_mload_ps1_(&scale);
*((__m128 *)(v))=_mdiv_ps_(*((__m128 *)(v)),temp);
*((__m128 *)(v+4))=_mdiv_ps_(*((__m128 *)(v+4)),temp);
*((__m128 *)(v+8))=_mdiv_ps_(*((__m128 *)(v+8)),temp);
*((__m128 *)(v+12))=_mdiv_ps_(*((__m128 *)(v+12)),temp);
//*/
  }
  inline float Dot(const SHCoeff4f_SSE &coef) const
  {
    /*///  Assembly code
    __m128 tmp;              //local variable to hold temporarily value
    _asm
    {
      mov eax, coef          //move &coef to eax, coef is a pointer as to assembler
      mov edx, this          //though "this" is always stored in ecx for thiscall functions
      //inline makes the call not guaranteed

      //coef to register
      movaps xmm0, [eax]coef.v
      movaps xmm1, [eax]coef.v+16
      movaps xmm2, [eax]coef.v+32
      movaps xmm3, [eax]coef.v+48

      //this to register
      movaps xmm4, [edx]this.v
      movaps xmm5, [edx]this.v+16
      movaps xmm6, [edx]this.v+32
      movaps xmm7, [edx]this.v+48

      //multiply and add          //mulps and addps are executed alternatively to increase parallelity
      mulps xmm4, xmm0
      mulps xmm5, xmm1
      addps xmm4, xmm5
      mulps xmm6, xmm2
      addps xmm4, xmm6
      mulps xmm7, xmm3
      addps xmm4, xmm7

      //register to tmp
      movaps tmp, xmm4
      fld tmp
      fadd tmp[4]
      fadd tmp[8]
      fadd tmp[12]
    }

/*///  Equivalent SSE intrinsic code

__m128 tmp0,tmp1;
__m128 tmp2=coef.v0;
tmp0=_mmul_ps_(*((__m128 *)(v)),tmp2); //*((__m128 *)(coef.v)));
tmp1=_mmul_ps_(*((__m128 *)(v+4)),*((__m128 *)(coef.v+4)));
tmp0=_madd_ps_(tmp0,tmp1);
tmp1=_mmul_ps_(*((__m128 *)(v+8)),*((__m128 *)(coef.v+8)));
tmp0=_madd_ps_(tmp0,tmp1);
tmp1=_mmul_ps_(*((__m128 *)(v+12)),*((__m128 *)(coef.v+12)));
tmp0=_madd_ps_(tmp0,tmp1);
return tmp0.m128_f32[0]+tmp0.m128_f32[1]+tmp0.m128_f32[2]+tmp0.m128_f64[3];
//*/
  }
  inline void Zero()
  {
    /*///  Assembly code
    _asm
    {
      mov edx, this          //though "this" is always stored in ecx for thiscall functions
      //inline makes the call not guaranteed

      //set xmm0 to zero
      xorps xmm0,xmm0

      //copy xmm0 to this
      movaps [edx]this.v, xmm0
      movaps [edx]this.v+16, xmm0
      movaps [edx]this.v+32, xmm0
      movaps [edx]this.v+48, xmm0
    }

/*///  Equivalent SSE intrinsic code

*((__m128 *)(v))=_msetzero_ps_();
*((__m128 *)(v+4))=_msetzero_ps_();
*((__m128 *)(v+8))=_msetzero_ps_();
*((__m128 *)(v+12))=_msetzero_ps_();
//*/
  }
  inline void Dump() const
  {
    printf("%.6f %.6f %.6f %.6f %.6f %.6f %.6f %.6f %.6f %.6f %.6f %.6f %.6f %.6f %.6f %.6f\n",
      v[0],v[1],v[2],v[3],
      v[4],v[5],v[6],v[7],
      v[8],v[9],v[10],v[11],
      v[12],v[13],v[14],v[15]);
  }
  inline float& operator [](int index)
  {
    return v[index];
  }
  inline const float& operator[](int index) const
  {
    return v[index];
  }
  inline void TripleProduct(const SHCoeff4f_SSE &coef)
  {
    TripleProduct4Appr(this,coef);
  }
  void TripleProduct4Appr(SHCoeff4f_SSE *ret,const SHCoeff4f_SSE &coef);

public:
  //constructor and destructor
  SHCoeff4f_SSE(void);
  SHCoeff4f_SSE(const SHCoeff4f_SSE &coef);
  SHCoeff4f_SSE(float *data);
  ~SHCoeff4f_SSE(void);

  //new and delete
  inline void* operator new(size_t size)
  {
    return _aligned_malloc(size,16);
  }
  inline void* operator new(size_t size,void *p)
  {
    return p;
  }
  inline void* operator new[](size_t size)
  {
    return _aligned_malloc(size,16);
  }
  inline void* operator new[](size_t size,void *p)
  {
    return p;
  }
  inline void operator delete(void *p)
  {
    _aligned_free(p);
  }
  inline void operator delete(void *p, void *c)
  {
    return;
  }
  inline void operator delete[](void *p)
  {
    _aligned_free(p);
  }
  inline void operator delete[](void *p,void *c)
  {
    return;
  }

  inline bool operator ==(const SHCoeff4f_SSE& x) const
  {
    return v[0]==x[0] &&\
      v[1]==x[1] &&\
      v[2]==x[2] &&\
      v[3]==x[3] &&\
      v[4]==x[4] &&\
      v[5]==x[5] &&\
      v[6]==x[6] &&\
      v[7]==x[7] &&\
      v[8]==x[8] &&\
      v[9]==x[9] &&\
      v[10]==x[10] &&\
      v[11]==x[11] &&\
      v[12]==x[12] &&\
      v[13]==x[13] &&\
      v[14]==x[14] &&\
      v[15]==x[15];
  }
  inline operator SHCoeff4f()
  {
    SHCoeff4f ret;
    for (int i=0; i<16; i++) ret.v[i]=v[i];
    return ret;
  }
};

}